Setup


In [7]:
import tsvopener
import pandas as pd
import numpy as np
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, vstack
from sklearn.semi_supervised import LabelPropagation, LabelSpreading




regex_categorized = tsvopener.open_tsv("categorized.tsv")
human_categorized = tsvopener.open_tsv("human_categorized.tsv")

# Accuracy Check
#
# match = 0
# no_match = 0
# for key in human_categorized:
#     if human_categorized[key] == regex_categorized[key]:
#         match += 1
#     else:
#         no_match += 1
# 
# print("accuracy of regex data in {} human-categorized words".format(
#             len(human_categorized)))
# print(match/(match+no_match))
# 
# accuracy of regex data in 350 human-categorized words
# 0.7857142857142857

Prepare Vectors


In [8]:
# set up targets for the human-categorized data
targets = pd.DataFrame.from_dict(human_categorized, 'index')
targets[0] = pd.Categorical(targets[0])
targets['code'] = targets[0].cat.codes
# form: | word (label) | language | code (1-5)

tmp_dict = {}
for key in human_categorized:
    tmp_dict[key] = tsvopener.etymdict[key]
supervised_sents = pd.DataFrame.from_dict(tmp_dict, 'index')

all_sents = pd.DataFrame.from_dict(tsvopener.etymdict, 'index')
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
all_sents.index.get_loc("anyways (adv.)")


Out[8]:
36478

In [9]:
# vectorize the unsupervised vectors.

vectors = vectorizer.fit_transform(all_sents.values[:,0])

print(vectors.shape)
# supervised_vectors = vectorizer.fit_transform(supervised_data.values[:,0])


(45723, 10000)

In [10]:
# add labels 

# initialize to -1
all_sents['code'] = -1


supervised_vectors = csr_matrix((len(human_categorized),
                                 vectors.shape[1]), 
                                dtype=vectors.dtype)

j = 0
for key in supervised_sents.index:
    all_sents.loc[key]['code'] = targets.loc[key]['code']
    i = all_sents.index.get_loc(key)
    supervised_vectors[j] = vectors[i]
    j += 1


    
# supervised_vectors = csr_matrix((len(human_categorized),
#                                  unsupervised_vectors.shape[1]), 
#                                 dtype=unsupervised_vectors.dtype)

# j = 0
# for key in supervised_data.index:
#     i = unsupervised_data.index.get_loc(key)
#     supervised_vectors[j] = unsupervised_vectors[i]
#     j += 1


    
all_sents.loc['dicky (n.)']


/home/trevor/anaconda3/envs/etym/lib/python3.5/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/trevor/anaconda3/envs/etym/lib/python3.5/site-packages/scipy/sparse/compressed.py:730: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  SparseEfficiencyWarning)
Out[10]:
0       "detached shirt front," 1811; "a small bird," ...
code                                                   -1
Name: dicky (n.), dtype: object

In [ ]:

Use Scikit's semisupervised learning

There are two semisupervised methods that scikit has. Label Propagation and Label Spreading. The difference is in how they regularize.


In [23]:
num_points = 1000
num_test = 50

x = vstack([vectors[:num_points], supervised_vectors]).toarray()
t = all_sents['code'][:num_points].append(targets['code'])

x_test = x[-num_test:]
t_test = t[-num_test:]
x = x[:-num_test]
t = t[:-num_test]

label_prop_model = LabelSpreading(kernel='knn')
from time import time

print("fitting model")
timer_start = time()
label_prop_model.fit(x, t)
print("runtime: %0.3fs" % (time()-timer_start))


fitting model
runtime: 409.998s

In [24]:
print("done!")

# unsupervised_data['code'].iloc[:1000]


done!

In [11]:
import pickle 

# with open("classifiers/labelspreading_knn_all_but_100.pkl", 'bw') as writefile:
#     pickle.dump(label_prop_model, writefile)

In [25]:
import smtplib
 
server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()
server.login("trevortds3@gmail.com", "Picardy3")
 
msg = "Job's done!"
server.sendmail("trevortds3@gmail.com", "trevortds@gmail.com", msg)
server.quit()


Out[25]:
(221, b'2.0.0 closing connection 17sm495479otj.30 - gsmtp')

In [15]:
targets


Out[15]:
0 code
keg (n.) Norse 4
Ganymede Greek 2
raw (adj.) English 0
handle (n.) English 0
cardamom (n.) French 1
bravo Other 5
wicket (n.) French 1
girandole (n.) French 1
deputize (v.) French 1
Cambodia Other 5
demeaning (adj.) English 0
fillet (v.) French 1
jeunesse doree (n.) French 1
concurring (adj.) Latin 3
transaction (n.) French 1
survival (n.) French 1
Angevin French 1
acme (n.) Greek 2
anamorphic (adj.) Greek 2
assortment (n.) French 1
noli me tangere Latin 3
Lernaean Latin 3
lark (v.) English 0
Toussaint (n.) French 1
marble (v.) French 1
wooly (adj.) English 0
serving (n.) French 1
hereafter (adv.) English 0
phi Greek 2
constable (n.) French 1
... ... ...
metric (adj.) French 1
clinic (n.) French 1
bracken (n.) Norse 4
over-excitement (n.) French 1
patho- Greek 2
atrophy (v.) French 1
gaydar (n.) English 0
wishbone (n.) English 0
latter (adv.) English 0
world war (n.) English 0
oocyte (n.) Greek 2
puddinghead (n.) English 0
naysayer (n.) French 1
externalize (v.) French 1
pyrotechnic (adj.) Greek 2
snowfall (n.) English 0
setter (n.) English 0
flageolet (n.) French 1
piracy (n.) Latin 3
Sahel Other 5
dulcimer (n.) French 1
whoever (pron.) English 0
geo- Greek 2
Mata Hari Other 5
rhotacism (n.) Latin 3
sparkle (n.) English 0
imbue (v.) Latin 3
empathetic (adj.) Greek 2
thermal (adj.) French 1
doorway (n.) English 0

350 rows × 2 columns

Measuring effectiveness.


In [26]:
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score



t_pred = label_prop_model.predict(x_test)

print("Metrics based on 50 hold-out points")

print("Macro")
print("accuracy: %f" % accuracy_score(t_test, t_pred))
print("precision: %f" % precision_score(t_test, t_pred, average='macro'))
print("recall: %f" % recall_score(t_test, t_pred, average='macro'))
print("f1: %f" % f1_score(t_test, t_pred, average='macro'))
print("\n\nMicro")
print("accuracy: %f" % accuracy_score(t_test, t_pred))
print("precision: %f" % precision_score(t_test, t_pred, average='micro'))
print("recall: %f" % recall_score(t_test, t_pred, average='micro'))
print("f1: %f" % f1_score(t_test, t_pred, average='micro'))

from sklearn import metrics
import matplotlib.pyplot as pl

labels = ["English", "French", "Greek", "Latin","Norse", "Other"]
labels_digits = [0, 1, 2, 3, 4, 5]
cm = metrics.confusion_matrix(t_test, t_pred, labels_digits)

fig = pl.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
pl.title("Label Spreading with KNN kernel (k=7)")
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
pl.xlabel('Predicted')
pl.ylabel('True')

pl.show()


Metrics based on 100 hold-out points
Macro
accuracy: 0.220000
precision: 0.149285
recall: 0.175926
f1: 0.130974


Micro
accuracy: 0.220000
precision: 0.220000
recall: 0.220000
f1: 0.220000
/home/trevor/anaconda3/envs/etym/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/trevor/anaconda3/envs/etym/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

PCA: Let's see what it looks like

Performing PCA


In [11]:
supervised_vectors


Out[11]:
<350x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 9602 stored elements in Compressed Sparse Row format>

In [13]:
import matplotlib.pyplot as pl

u, s, v = np.linalg.svd(supervised_vectors.toarray())
pca = np.dot(u[:,0:2], np.diag(s[0:2]))



In [15]:
english = np.empty((0,2))
french = np.empty((0,2))
greek = np.empty((0,2))
latin = np.empty((0,2))
norse = np.empty((0,2))
other = np.empty((0,2))

for i in range(pca.shape[0]):
    if targets[0].iloc[i] == "English":
        english = np.vstack((english, pca[i]))
    elif targets[0].iloc[i] == "French":
        french = np.vstack((french, pca[i]))
    elif targets[0].iloc[i] == "Greek":
        greek = np.vstack((greek, pca[i]))
    elif targets[0].iloc[i] == "Latin":
        latin = np.vstack((latin, pca[i]))
    elif targets[0].iloc[i] == "Norse":
        norse = np.vstack((norse, pca[i]))
    elif targets[0].iloc[i] == "Other":
        other = np.vstack((other, pca[i]))
        
pl.plot( english[:,0], english[:,1], "ro", 
          french[:,0],  french[:,1], "bs",
           greek[:,0],   greek[:,1], "g+",
           latin[:,0],   latin[:,1], "c^",
           norse[:,0],   norse[:,1], "mD",
           other[:,0],   other[:,1], "kx")
pl.axis([-5,0,-2, 5])
pl.show()



In [17]:
print (s)


[  5.91620224e+01   2.71903266e+01   2.25373321e+01   2.18548378e+01
   2.05293711e+01   2.02552813e+01   1.90754512e+01   1.85991696e+01
   1.81873986e+01   1.80134132e+01   1.72471862e+01   1.68548772e+01
   1.65085407e+01   1.62650006e+01   1.57953228e+01   1.54611529e+01
   1.50303486e+01   1.48647240e+01   1.42207480e+01   1.40830743e+01
   1.38100719e+01   1.36810377e+01   1.34711118e+01   1.29794841e+01
   1.29328253e+01   1.26212845e+01   1.25284216e+01   1.22711388e+01
   1.22234562e+01   1.20992262e+01   1.19344953e+01   1.18256517e+01
   1.16621712e+01   1.15169454e+01   1.11559128e+01   1.11172564e+01
   1.09965146e+01   1.07834035e+01   1.06354416e+01   1.05598111e+01
   1.05040540e+01   1.03044157e+01   1.00384970e+01   9.95060504e+00
   9.80957830e+00   9.62082994e+00   9.59783806e+00   9.55042494e+00
   9.40766998e+00   9.27471794e+00   9.12442460e+00   9.00725264e+00
   8.94174012e+00   8.89634153e+00   8.86327938e+00   8.73631683e+00
   8.63344898e+00   8.56110426e+00   8.43019754e+00   8.37597463e+00
   8.33290257e+00   8.27182220e+00   8.23673133e+00   8.12892926e+00
   8.05098156e+00   8.01639554e+00   7.89447701e+00   7.85036818e+00
   7.75353363e+00   7.70371203e+00   7.69309697e+00   7.59309821e+00
   7.53870605e+00   7.42894652e+00   7.36593410e+00   7.33917439e+00
   7.30685048e+00   7.23261164e+00   7.18562750e+00   7.16422048e+00
   7.07553708e+00   7.06647784e+00   7.00851735e+00   6.95249135e+00
   6.87706673e+00   6.86320591e+00   6.85258130e+00   6.73528203e+00
   6.71341256e+00   6.62620067e+00   6.61189382e+00   6.56551907e+00
   6.48725053e+00   6.47353990e+00   6.44774014e+00   6.42189497e+00
   6.34945126e+00   6.33635203e+00   6.29718001e+00   6.25692748e+00
   6.19701173e+00   6.17973520e+00   6.15567560e+00   6.12907840e+00
   6.08100714e+00   6.06000577e+00   6.03287749e+00   6.02933807e+00
   5.99912320e+00   5.97827993e+00   5.91807281e+00   5.89696472e+00
   5.85928901e+00   5.83465375e+00   5.81825110e+00   5.76785173e+00
   5.69884611e+00   5.68441884e+00   5.67886008e+00   5.66503104e+00
   5.64118308e+00   5.62685802e+00   5.62328862e+00   5.54333149e+00
   5.51022672e+00   5.50212372e+00   5.45635845e+00   5.42840128e+00
   5.41055251e+00   5.39485463e+00   5.37178423e+00   5.34700924e+00
   5.33119914e+00   5.30342689e+00   5.28174842e+00   5.26575457e+00
   5.23834984e+00   5.20694135e+00   5.19020006e+00   5.16171577e+00
   5.15307331e+00   5.11366306e+00   5.08951824e+00   5.08043932e+00
   5.05632805e+00   5.03951880e+00   5.02170609e+00   4.99330871e+00
   4.97455397e+00   4.96131269e+00   4.90657807e+00   4.90421133e+00
   4.88360745e+00   4.87805323e+00   4.84127903e+00   4.83383211e+00
   4.79793617e+00   4.79084463e+00   4.74435725e+00   4.72977315e+00
   4.72633702e+00   4.69921667e+00   4.67594537e+00   4.64762834e+00
   4.62283660e+00   4.61123143e+00   4.59333483e+00   4.55667800e+00
   4.55487119e+00   4.54252515e+00   4.51858068e+00   4.47135159e+00
   4.46751138e+00   4.44309841e+00   4.42274337e+00   4.41170017e+00
   4.38343403e+00   4.36344550e+00   4.34710623e+00   4.31680208e+00
   4.30357336e+00   4.29415007e+00   4.27494983e+00   4.25288321e+00
   4.24258225e+00   4.21292455e+00   4.18465889e+00   4.17666500e+00
   4.16848831e+00   4.13179108e+00   4.10662428e+00   4.09367116e+00
   4.08276950e+00   4.06962523e+00   4.05648628e+00   4.02270282e+00
   4.00157776e+00   3.98486000e+00   3.97521433e+00   3.95748808e+00
   3.94071036e+00   3.92832089e+00   3.92400654e+00   3.91596423e+00
   3.87691156e+00   3.86760988e+00   3.84023574e+00   3.83699536e+00
   3.80326425e+00   3.78249742e+00   3.75428849e+00   3.73462003e+00
   3.70710567e+00   3.69914220e+00   3.68744123e+00   3.66623141e+00
   3.65410376e+00   3.62376951e+00   3.61985574e+00   3.59945490e+00
   3.56853079e+00   3.54631093e+00   3.54045391e+00   3.52498211e+00
   3.49025551e+00   3.47486359e+00   3.46983201e+00   3.45639446e+00
   3.43247778e+00   3.42303538e+00   3.41622757e+00   3.38370225e+00
   3.36899284e+00   3.35735153e+00   3.33617643e+00   3.31651429e+00
   3.30893375e+00   3.28222948e+00   3.26545621e+00   3.25579743e+00
   3.23714510e+00   3.21889205e+00   3.19853411e+00   3.19054881e+00
   3.17404164e+00   3.16397576e+00   3.14724600e+00   3.12090826e+00
   3.09682950e+00   3.09109158e+00   3.08198848e+00   3.07408811e+00
   3.05489219e+00   3.00045712e+00   2.98562464e+00   2.97663690e+00
   2.96080758e+00   2.94171719e+00   2.92512140e+00   2.91339012e+00
   2.90456883e+00   2.89698888e+00   2.88588297e+00   2.87435774e+00
   2.84774494e+00   2.83427309e+00   2.82253109e+00   2.81218591e+00
   2.79407108e+00   2.78091722e+00   2.77164313e+00   2.75786737e+00
   2.73232181e+00   2.72205267e+00   2.68973589e+00   2.67816897e+00
   2.66768897e+00   2.65612647e+00   2.63527825e+00   2.62218115e+00
   2.60899033e+00   2.60179147e+00   2.58519333e+00   2.57398729e+00
   2.55319231e+00   2.54707318e+00   2.51525775e+00   2.50928206e+00
   2.49798107e+00   2.47752596e+00   2.46082405e+00   2.44789996e+00
   2.43781208e+00   2.42337148e+00   2.37877014e+00   2.36397765e+00
   2.34423063e+00   2.32521781e+00   2.30443806e+00   2.28767667e+00
   2.25974349e+00   2.25225851e+00   2.23069208e+00   2.20879433e+00
   2.19818227e+00   2.15452837e+00   2.13003021e+00   2.10114354e+00
   2.08408874e+00   2.06706333e+00   2.03305891e+00   2.00453592e+00
   1.99422035e+00   1.98176476e+00   1.96403012e+00   1.95435470e+00
   1.89485616e+00   1.85497403e+00   1.80593168e+00   1.75025015e+00
   1.72640568e+00   1.71779958e+00   1.71208774e+00   1.69306229e+00
   1.67024606e+00   1.65664206e+00   1.61255136e+00   1.59659905e+00
   1.48568398e+00   1.43589975e+00   1.36775590e+00   1.33071770e+00
   1.30151167e+00   1.28251284e+00   1.24081899e+00   1.22199771e+00
   1.20023909e+00   1.18018785e+00   1.15772257e+00   1.11618851e+00
   1.09146571e+00   1.06876030e+00   1.01863914e+00   9.89895736e-01
   6.77936796e-01   5.63668345e-01   2.62730405e-14   4.40975938e-15
   1.55993787e-15   1.48858648e-15]